Flight - Homework2 - Ngoc131112(Python)

Loading...
 
Year
Quarter
Month
DayofMonth
DayOfWeek
FlightDate
UniqueCarrier
AirlineID
Carrier
TailNum
FlightNum
OriginAirportID
OriginAirportSeqID
OriginCityMarketID
Origin
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2018
1
1
16
2
2018-01-16
AA
19805
AA
N128AN
228
12892
1289206
32575
LAX
2018
1
1
17
3
2018-01-17
AA
19805
AA
N128AN
228
12892
1289206
32575
LAX
2018
1
1
18
4
2018-01-18
AA
19805
AA
N121AN
228
12892
1289206
32575
LAX
2018
1
1
19
5
2018-01-19
AA
19805
AA
N129AA
228
12892
1289206
32575
LAX
2018
1
1
20
6
2018-01-20
AA
19805
AA
N133AN
228
12892
1289206
32575
LAX
2018
1
1
21
7
2018-01-21
AA
19805
AA
N127AA
228
12892
1289206
32575
LAX
2018
1
1
22
1
2018-01-22
AA
19805
AA
N130AN
228
12892
1289206
32575
LAX
2018
1
1
23
2
2018-01-23
AA
19805
AA
N131NN
228
12892
1289206
32575
LAX
2018
1
1
24
3
2018-01-24
AA
19805
AA
N131NN
228
12892
1289206
32575
LAX
2018
1
1
25
4
2018-01-25
AA
19805
AA
N132AN
228
12892
1289206
32575
LAX
2018
1
1
26
5
2018-01-26
AA
19805
AA
N131NN
228
12892
1289206
32575
LAX
2018
1
1
27
6
2018-01-27
AA
19805
AA
N133AN
228
12892
1289206
32575
LAX
2018
1
1
28
7
2018-01-28
AA
19805
AA
N124AA
228
12892
1289206
32575
LAX
2018
1
1
29
1
2018-01-29
AA
19805
AA
N130AN
228
12892
1289206
32575
LAX
2018
1
1
30
2
2018-01-30
AA
19805
AA
N129AA
228
12892
1289206
32575
LAX
2018
1
1
31
3
2018-01-31
AA
19805
AA
N125AA
228
12892
1289206
32575
LAX
2018
1
1
1
1
2018-01-01
AA
19805
AA
N349AN
229
11298
1129804
30194
DFW
3,480 rows|Truncated data
 
IATA_CODE
AIRPORT
CITY
STATE
COUNTRY
LATITUDE
LONGITUDE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
ABE
Lehigh Valley International Airport
Allentown
PA
USA
40.65236
-75.4404
ABI
Abilene Regional Airport
Abilene
TX
USA
32.41132
-99.6819
ABQ
Albuquerque International Sunport
Albuquerque
NM
USA
35.04022
-106.60919
ABR
Aberdeen Regional Airport
Aberdeen
SD
USA
45.44906
-98.42183
ABY
Southwest Georgia Regional Airport
Albany
GA
USA
31.53552
-84.19447
ACK
Nantucket Memorial Airport
Nantucket
MA
USA
41.25305
-70.06018
ACT
Waco Regional Airport
Waco
TX
USA
31.61129
-97.23052
ACV
Arcata Airport
Arcata/Eureka
CA
USA
40.97812
-124.10862
ACY
Atlantic City International Airport
Atlantic City
NJ
USA
39.45758
-74.57717
ADK
Adak Airport
Adak
AK
USA
51.87796
-176.64603
ADQ
Kodiak Airport
Kodiak
AK
USA
57.74997
-152.49386
AEX
Alexandria International Airport
Alexandria
LA
USA
31.32737
-92.54856
AGS
Augusta Regional Airport (Bush Field)
Augusta
GA
USA
33.36996
-81.9645
AKN
King Salmon Airport
King Salmon
AK
USA
58.6768
-156.64922
ALB
Albany International Airport
Albany
NY
USA
42.74812
-73.80298
ALO
Waterloo Regional Airport
Waterloo
IA
USA
42.55708
-92.40034
AMA
Rick Husband Amarillo International Airport
Amarillo
TX
USA
35.21937
-101.70593
322 rows
# File location and type
file_location = "/FileStore/tables/airlines.csv"
file_type = "csv"

# CSV options
infer_schema = "true"
first_row_is_header = "true"
delimiter = ","

# The applied options are for CSV files. For other file types, these will be ignored.
df_airlines = spark.read.format(file_type) \
  .option("inferSchema", infer_schema) \
  .option("header", first_row_is_header) \
  .option("sep", delimiter) \
  .load(file_location)

temp_table_name = "airlines"
df_airlines.createOrReplaceTempView(temp_table_name)
display(df_airlines)
 
IATA_CODE
AIRLINE
1
2
3
4
5
6
7
8
9
10
11
12
13
14
UA
United Air Lines Inc.
AA
American Airlines Inc.
US
US Airways Inc.
F9
Frontier Airlines Inc.
B6
JetBlue Airways
OO
Skywest Airlines Inc.
AS
Alaska Airlines Inc.
NK
Spirit Air Lines
WN
Southwest Airlines Co.
DL
Delta Air Lines Inc.
EV
Atlantic Southeast Airlines
HA
Hawaiian Airlines Inc.
MQ
American Eagle Airlines Inc.
VX
Virgin America
14 rows
%sql
--Find out which airports when treated as Origin have the smallest and the largest Departure delays.
select Origin, MIN(DepDelay) as MinDepartureDelay, MAX(DepDelay) as MaxDepartureDelay
from flights
group by Origin
order by MinDepartureDelay asc, MaxDepartureDelay desc
 
Origin
MinDepartureDelay
MaxDepartureDelay
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
YAK
-50
78
LAS
-49
804
DTW
-46
1332
RNO
-46
340
JMS
-45
191
PLN
-43
828
MEI
-42
1424
CDV
-42
26
KTN
-38
174
PSG
-37
136
SCC
-35
237
UIN
-35
142
IAG
-34
888
LEX
-34
466
RSW
-32
994
CAK
-32
931
BRW
-31
231
334 rows
%sql
--Create a list of U.S. States with the number of airports that each of them has in descending order. 
--Show this on a map of the U.S. (Hint: Use: a choropleth map. States are listed as USPS Abbreviations)

select State, count(*) as AirportCount
from airports
where State is not NULL
group by State
order by AirportCount desc
 
State
AirportCount
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
TX
24
CA
22
AK
19
FL
17
MI
15
NY
14
CO
10
MN
8
WI
8
MT
8
NC
8
ND
8
PA
8
LA
7
VA
7
IL
7
GA
7
54 rows
%sql
-- Create a list containing: DATE, NUM_OF_FLIGHTS, MAX_ARRIVAL_DELAY, AVG_ARRIVAL_DELAY. 
-- Please provide the date (from FlightDate) in the following format i.e. 2018-01-10. 
-- The maximum and Average delays should be provided in minutes but rounded to 2 digits after the decimal point.

select 
    to_date(FlightDate) as DATE,
    count(*) as NUM_OF_FLIGHTS,
    round(max(ArrDelay), 2) as MAX_ARRIVAL_DELAY,
    round(avg(ArrDelay), 2) as AVG_ARRIVAL_DELAY
from flights
group by to_date(FlightDate)
 
DATE
NUM_OF_FLIGHTS
MAX_ARRIVAL_DELAY
AVG_ARRIVAL_DELAY
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
2018-01-23
18282
1366
-0.94
2018-01-11
19417
2023
4.85
2018-01-18
19418
1009
1.49
2018-01-08
19287
1454
13.71
2018-01-14
16697
1123
-3.37
2018-01-05
19283
1316
12.43
2018-01-13
14725
1378
0.26
2018-01-16
18291
1076
4.34
2018-01-12
19577
1390
23.53
2018-01-20
14862
1432
-9.07
2018-01-25
19429
1015
-5.45
2018-01-29
19316
1151
-0.21
2018-01-19
19534
1210
-5.7
2018-01-04
19242
1486
2.32
2018-01-02
19881
1778
10.82
2018-01-17
18819
1343
23.88
2018-01-03
19193
1717
6.6
31 rows
%sql
-- Show on a bar chart the number of flights per Carrier. Use the full name of the airline (from airlines.csv).

select a.airline, count(*) as flightcount
from flights f
join airlines a on f.Carrier = a.IATA_CODE
group by airline
order by flightcount

 
airline
flightcount
1
2
3
4
5
6
7
8
9
10
11
12
13
Virgin America
5824
Hawaiian Airlines Inc.
6627
Frontier Airlines Inc.
9707
Spirit Air Lines
14180
Alaska Airlines Inc.
15312
Atlantic Southeast Airlines
20166
American Eagle Airlines Inc.
22502
JetBlue Airways
24871
United Air Lines Inc.
45384
Skywest Airlines Inc.
62207
Delta Air Lines Inc.
71254
American Airlines Inc.
73598
Southwest Airlines Co.
109676
13 rows
%sql
-- Show on a pie chart the number of flights per Carrier but this time only show the individual results for the top 10 Carriers (measured by number of flights) and the rest show as one OTHER element. 
-- This will probably require the use of Temporary Views (Create or Replace Temporary View new_view as Select * from XXX). 
-- If it is done correctly OTHER should account for 16% of all flights.

-- The number of flights per Carrier
create or replace temporary view flights_per_carrier as 
  select Carrier, count(*) as flightcount
  from flights
  group by Carrier;

-- Carriers ranking (measured by number of flights)
create or replace temporary view carriers_ranking as
  select Carrier, flightcount,
         rank() over (order by flightcount desc) as rank
  from flights_per_carrier;

-- Top 10 Carriers and group the rest as 'OTHER'
  select case
           when rank <= 10 then Carrier
           else 'OTHER'
         end as Carrier,
         SUM(flightcount) as flightcount
  from carriers_ranking
  group by case
             when rank <= 10 then Carrier
             else 'OTHER'
           end;

 
Carrier
flightcount
1
2
3
4
5
6
7
8
9
10
11
WN
109676
AA
73598
DL
71254
OO
62207
UA
45384
YX
25212
B6
24871
MQ
22502
OH
22210
EV
20166
OTHER
93051
11 rows
Beta
0 / 10used queries